/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_Citation.csv (downloaded 5/22/2018)

Outputs: 	clean_citation.dta

*******************************************************************************/

clear all

* Load the raw PDB citation export
import delimited using "${data_raw}PDB/pdb_Citation.csv", clear

* The imported variable names are all lowercase; switch them to camelCase
* in a single group rename (old names first, new names second, matched by position)
rename (structureid publicationyear journalname volumeid firstpage lastpage pubmedid) ///
       (structureId publicationYear journalName volumeId firstPage lastPage pubmedId)

	
* Repair author lists that were separated in the raw data.
* dup counts the surplus rows per structureId (useful for inspecting the
* problem cases); it is a scratch variable and is dropped again below.
duplicates tag structureId, generate(dup)

* Overwrite the author string for the affected structures with the full list
replace authors = "Menting, J.G., Whittaker, J., Margetts, M.B., Whittaker, L.J., Kong, G.K., Smith, B.J., Watson, C.J., Zakova, L., Kletvikova, E., Jiracek, J., Chan, S.J., Steiner, D.F., Dodson, G.G., Brzozowski, A.M., Weiss, M.A., Ward, C.W., Lawrence, M.C." if structureId == "5KQV"
replace authors = "Convery, M.A., Young, R.J., Senger, S., Hamblin, J.N., Chan, C., Toomey, J.R., Watson, N.S." if inlist(structureId, "4Y7A", "4Y7B")

* Rows that are now identical on every variable collapse to one;
* isid then stops the script unless structureId uniquely identifies rows.
duplicates drop
drop dup
isid structureId
	
* Clean up journal names
* Step 1: normalise case and surrounding whitespace so the exact-match
* comparisons below only need to list lowercase, trimmed abbreviations.
replace journalName = lower(journalName)
replace journalName = strtrim(journalName)

* Step 2: map known abbreviations to a single full journal title.
* Each line lists every abbreviated spelling that is folded into the title on the left.
* NOTE(review): "biol.chem." and "bioorg.med.chem." / "biorg.med.chem" look like
* abbreviations of journals other than the ones they are mapped to below
* (Biological Chemistry; Bioorganic & Medicinal Chemistry) -- confirm these merges are intended.
replace journalName = "journal of biological chemistry" if journalName == "j.biol.chem." | journalName == "j. biol. chem." | journalName == "biol.chem."
replace journalName = "journal of molecular biology" if journalName == "j.mol.biol." | journalName == "j. mol. biol."
replace journalName = "proceedings of the national academy of sciences of the united states of america" if journalName == "proc.natl.acad.sci.usa" | journalName == "proc. natl. acad. sci. u.s.a."
replace journalName = "acta crystallographica section d-structural biology" if journalName == "acta crystallogr.,sect.d" | journalName == "acta crystallogr d struct biol" | journalName == "acta crystallogr. d biol. crystallogr."
replace journalName = "journal of medicinal chemistry" if journalName == "j.med.chem" | journalName == "j. med. chem." | journalName == "j.med.chem."
replace journalName = "nature structural & molecular biology" if journalName == "nat.struct.mol.biol." | journalName == "nat. struct. mol. biol."
replace journalName = "nature communications" if journalName == "nat commun"
replace journalName = "protein science" if journalName == "protein sci."
replace journalName = "proteins-structure function and bioinformatics" if journalName == "proteins"
replace journalName = "embo journal" if journalName == "embo j."
replace journalName = "nucleic acids research" if journalName == "nucleic acids res."
replace journalName = "journal of the american chemical society" if journalName == "j.am.chem.soc." | journalName == "j. am. chem. soc."
replace journalName = "bioorganic & medicinal chemistry letters" if journalName == "bioorg.med.chem.lett." | journalName == "biorg.med.chem" | journalName == "bioorg.med.chem." | journalName == "biorg. med. chem. lett." | journalName == "bioorg. med. chem. lett."
replace journalName = "scientific reports" if journalName == "sci rep"
replace journalName = "molecular cell" if journalName == "mol.cell"
replace journalName = "febs journal" if journalName == "febs j."
replace journalName = "acta crystallographica section f-structural biology communications" if journalName == "acta crystallogr.,sect.f" | journalName == "acta crystallogr f struct biol commun" | journalName == "acta crystallogr. sect. f struct. biol. cryst. commun."
replace journalName = "journal of virology" if journalName == "j.virol."
replace journalName = "biochemical journal" if journalName == "biochem.j." | journalName == "biochem. j."
replace journalName = "febs letters" if journalName == "febs lett."
replace journalName = "journal of structural biology" if journalName == "j.struct.biol."
replace journalName = "angewandte chemie-international edition" if journalName == "angew.chem.int.ed.engl." | journalName == "angew. chem. int. ed. engl."
replace journalName = "biochemical and biophysical research communications" if journalName == "biochem.biophys.res.commun." | journalName == "biochem.biophys.res.comm." | journalName == "biochem. biophys. res. commun."
replace journalName = "acs chemical biology" if journalName == "acs chem.biol." | journalName == "acs chem. biol."
replace journalName = "nature chemical biology" if journalName == "nat.chem.biol." | journalName == "nat. chem. biol."
replace journalName = "plos pathogens" if journalName == "plos pathog."
replace journalName = "chemistry & biology" if journalName == "chem.biol."
replace journalName = "biochimica et biophysica acta-general subjects" if journalName == "biochim.biophys.acta"
replace journalName = "european journal of biochemistry" if journalName == "euro.j.biochem." | journalName == "eur.j.biochem."
replace journalName = "journal of bacteriology" if journalName == "j.bacteriol."
replace journalName = "molecular microbiology" if journalName == "mol.microbiol."
replace journalName = "cell reports" if journalName == "cell rep" | journalName == "cell rep."
replace journalName = "genes & development" if journalName == "genes dev."
replace journalName = "plos biology" if journalName == "plos biol."
replace journalName = "folding & design" if journalName == "structure fold.des."
replace journalName = "journal of biological inorganic chemistry" if journalName == "j.biol.inorg.chem." | journalName == "j. biol. inorg. chem."
replace journalName = "biophysical journal" if journalName == "biophys.j."
replace journalName = "archives of biochemistry and biophysics" if journalName == "arch.biochem.biophys."
replace journalName = "journal of biomolecular nmr" if journalName == "j.biomol.nmr"
replace journalName = "antimicrobial agents and chemotherapy" if journalName == "antimicrob.agents chemother." | journalName == "antimicrob. agents chemother."
replace journalName = "bmc structural biology" if journalName == "bmc struct.biol."
replace journalName = "acs medicinal chemistry letters" if journalName == "acs med.chem.lett." | journalName == "acs med chem lett"
replace journalName = "journal of immunology" if journalName == "j.immunol."
replace journalName = "embo reports" if journalName == "embo rep."
replace journalName = "journal of biochemistry" if journalName == "j.biochem.(tokyo)"
replace journalName = "cancer research" if journalName == "cancer res."
replace journalName = "cancer discovery" if journalName == "cancer discov"
replace journalName = "journal of biomolecular structure and dynamics" if journalName == "j.biomol.struct.dyn."
replace journalName = "nature cell biology" if journalName == "nat.cell biol."
replace journalName = "organic & biomolecular chemistry" if journalName == "org.biomol.chem."
replace journalName = "acs chemical neuroscience" if journalName == "acs chem neurosci"
replace journalName = "antiviral research" if journalName == "antiviral res."
replace journalName = "archives of virology" if journalName == "arch. virol."
replace journalName = "bioscience reports" if journalName == "biosci.rep."
replace journalName = "bmc biology" if journalName == "bmc biol."
replace journalName = "cell research" if journalName == "cell res."
replace journalName = "cold spring harbor symposia on quantitative biology" if journalName == "cold spring harbor symp.quant.biol."
replace journalName = "current biology" if journalName == "curr.biol."
replace journalName = "dalton transactions" if journalName == "dalton trans"
replace journalName = "journal of chemical information and modeling" if journalName == "j chem inf model"
replace journalName = "journal of computer-aided molecular design" if journalName == "j comput aided mol des"
replace journalName = "journal of experimental botany" if journalName == "j. exp. bot."
* Previously mapped to the truncated title "chemistry"; use the full journal title like every other row
replace journalName = "journal of agricultural and food chemistry" if journalName == "j.agric.food chem."
replace journalName = "journal of cellular and molecular medicine" if journalName == "j.cell.mol.med."
replace journalName = "journal of clinical investigation" if journalName == "j.clin.invest."
replace journalName = "journal of pharmacology and experimental therapeutics" if journalName == "j.pharmacol.exp.ther."
replace journalName = "journal of physical chemistry b" if journalName == "j.phys.chem.b"
replace journalName = "molecular biology of the cell" if journalName == "mol biol cell"
replace journalName = "molecular cancer therapeutics" if journalName == "mol.cancer ther."
replace journalName = "molecular and cellular endocrinology" if journalName == "mol.cell.endocrinol."
replace journalName = "plant physiology" if journalName == "plant physiol."
replace journalName = "plos genetics" if journalName == "plos genet."
replace journalName = "protein engineering design & selection" if journalName == "protein eng."
replace journalName = "protein expression and purification" if journalName == "protein expr.purif."
replace journalName = "molecular and cellular biology" if journalName == "mol.cell.biol."

// this gets you about 90% of articles with cleaned names; journals not listed above keep their lowercase, trimmed raw name

* Clean up and parse author names
rename authors paperAuthor

* Authors are delimited by ".," in the raw string. split creates paperAuthor1,
* paperAuthor2, ... and returns how many variables it made in r(nvars).
* Capture that count immediately (before another command overwrites r()) so the
* loops below follow the data instead of a hard-coded count (88 in the
* 5/22/2018 download). Because maxAuthors is a local macro, run this section
* as one piece rather than line by line.
split paperAuthor, parse(".,")
local maxAuthors = r(nvars)

forvalues n = 1/`maxAuthors' {
	replace paperAuthor`n' = subinstr(paperAuthor`n', ".", "", .)   // remove the periods from names for consistency
	replace paperAuthor`n' = strtrim(paperAuthor`n')   // remove any leading / trailing spaces
}

* First author is the first piece. The last author and the author count both
* come from the highest-numbered non-empty piece, so a single pass updates both.
* NOTE(review): numPaperAuthors starts at 1 unconditionally, so a record whose
* author string is empty is still counted as having one author -- confirm intended.
gen firstPaperAuthor = paperAuthor1
gen lastPaperAuthor = paperAuthor1
gen numPaperAuthors = 1
forvalues n = 2/`maxAuthors' {
	replace lastPaperAuthor = paperAuthor`n' if paperAuthor`n' != ""
	replace numPaperAuthors = `n' if paperAuthor`n' != ""
}

save "${data_clean}clean_citation.dta", replace
